% IDC report on the 2020 digital universe. The place name is braced so that
% sentence-casing styles do not lowercase it.
@article{gantz2012digital,
  author       = {Gantz, John and Reinsel, David},
  title        = {The digital universe in 2020: Big data, bigger digital shadows, and biggest growth in the {Far East}},
  journal      = {IDC iView: IDC Analyze the Future},
  year         = {2012},
}
% Broder (1997): conference paper on resemblance and containment of documents.
@inproceedings{broder1997resemblance,
  author       = {Broder, Andrei Z},
  title        = {On the resemblance and containment of documents},
  booktitle    = {Compression and Complexity of Sequences 1997. Proceedings},
  pages        = {21--29},
  year         = {1997},
  organization = {IEEE},
}
% Broder et al. (1997): journal article on syntactic clustering of the web.
@article{broder1997syntactic,
  author       = {Broder, Andrei Z and Glassman, Steven C and Manasse, Mark S and Zweig, Geoffrey},
  title        = {Syntactic clustering of the web},
  journal      = {Computer Networks and ISDN Systems},
  volume       = {29},
  number       = {8},
  pages        = {1157--1166},
  year         = {1997},
  publisher    = {Elsevier},
}
% Charikar (2002): similarity estimation from rounding algorithms (conference paper).
@inproceedings{charikar2002similarity,
  author       = {Charikar, Moses S},
  title        = {Similarity estimation techniques from rounding algorithms},
  booktitle    = {Proceedings of the thirty-fourth annual ACM symposium on Theory of computing},
  pages        = {380--388},
  year         = {2002},
  organization = {ACM},
}
% Manku, Jain and Das Sarma (2007): near-duplicate detection for web crawling.
@inproceedings{manku2007detecting,
  author       = {Manku, Gurmeet Singh and Jain, Arvind and Das Sarma, Anish},
  title        = {Detecting near-duplicates for web crawling},
  booktitle    = {Proceedings of the 16th international conference on World Wide Web},
  pages        = {141--150},
  year         = {2007},
  organization = {ACM},
}
% The Google file system paper, recorded as an issue of a periodical.
% Typed as an article because the venue carries both volume and number, which
% standard styles do not accept together on conference-paper entries.
% The company name is braced to survive sentence casing.
@article{ghemawat2003google,
  author       = {Ghemawat, Sanjay and Gobioff, Howard and Leung, Shun-Tak},
  title        = {The {Google} file system},
  journal      = {ACM SIGOPS Operating Systems Review},
  volume       = {37},
  number       = {5},
  pages        = {29--43},
  year         = {2003},
  publisher    = {ACM},
}
% Dean and Ghemawat (2008): MapReduce article in Communications of the ACM.
% The system name is braced so its internal capital survives sentence casing.
@article{dean2008mapreduce,
  author       = {Dean, Jeffrey and Ghemawat, Sanjay},
  title        = {{MapReduce}: simplified data processing on large clusters},
  journal      = {Communications of the ACM},
  volume       = {51},
  number       = {1},
  pages        = {107--113},
  year         = {2008},
  publisher    = {ACM},
}
% Song, Deng and Xie (2013): fingerprint prefetching for data deduplication.
@inproceedings{liang2013fpp,
  author       = {Song, Liangshan and Deng, Yuhui and Xie, Junjie},
  title        = {Exploiting Fingerprint Prefetching to Improve the Performance of Data Deduplication},
  booktitle    = {Proceedings of the 15th IEEE International Conference on High Performance Computing and Communications},
  year         = {2013},
  publisher    = {IEEE},
}
% Indyk and Motwani (1998): approximate nearest neighbors (conference paper).
@inproceedings{indyk1998approximate,
  author       = {Indyk, Piotr and Motwani, Rajeev},
  title        = {Approximate nearest neighbors: towards removing the curse of dimensionality},
  booktitle    = {Proceedings of the thirtieth annual ACM symposium on Theory of computing},
  pages        = {604--613},
  year         = {1998},
  organization = {ACM},
}
% Agrawal et al. (2007): five-year study of file-system metadata.
@article{agrawal2007five,
  author       = {Agrawal, Nitin and Bolosky, William J and Douceur, John R and Lorch, Jacob R},
  title        = {A five-year study of file-system metadata},
  journal      = {ACM Transactions on Storage (TOS)},
  volume       = {3},
  number       = {3},
  pages        = {9},
  year         = {2007},
  publisher    = {ACM},
}
% Meyer and Bolosky (2012): study of practical deduplication.
@article{meyer2012study,
  author       = {Meyer, Dutch T and Bolosky, William J},
  title        = {A study of practical deduplication},
  journal      = {ACM Transactions on Storage (TOS)},
  volume       = {7},
  number       = {4},
  pages        = {14},
  year         = {2012},
  publisher    = {ACM},
}
% Bitton and DeWitt (1983): duplicate record elimination in large data files.
% Journal name capitalised like the other ACM Transactions entries in this file.
@article{bitton1983duplicate,
  author       = {Bitton, Dina and DeWitt, David J},
  title        = {Duplicate record elimination in large data files},
  journal      = {ACM Transactions on Database Systems (TODS)},
  volume       = {8},
  number       = {2},
  pages        = {255--265},
  year         = {1983},
  publisher    = {ACM},
}
% Buckland and Gey (1994): relationship between recall and precision.
% The journal is stored under its full name rather than the JASIS abbreviation.
@article{buckland1994relationship,
  author       = {Buckland, Michael K. and Gey, Fredric C.},
  title        = {The relationship between recall and precision},
  journal      = {Journal of the American Society for Information Science},
  volume       = {45},
  number       = {1},
  pages        = {12--19},
  year         = {1994},
}
% Powers (2011): survey of evaluation measures.
% F-measure and ROC are braced so sentence-casing styles keep their capitals.
@article{powers2011evaluation,
  author       = {Powers, David MW},
  title        = {Evaluation: from precision, recall and {F-measure} to {ROC}, informedness, markedness \& correlation},
  journal      = {Journal of Machine Learning Technologies},
  volume       = {2},
  number       = {1},
  pages        = {37--63},
  year         = {2011},
}
% Quinlan and Dorward (2002): Venti archival storage (FAST paper).
% The title carries no final period; the bibliography style supplies punctuation.
@inproceedings{quinlan2002venti,
  author       = {Quinlan, Sean and Dorward, Sean},
  title        = {Venti: A New Approach to Archival Storage},
  booktitle    = {FAST},
  pages        = {89--101},
  year         = {2002},
}
% Sapuntzakis et al. (2002): optimizing the migration of virtual computers.
@article{sapuntzakis2002optimizing,
  author       = {Sapuntzakis, Constantine P and Chandra, Ramesh and Pfaff, Ben and Chow, Jim and Lam, Monica S and Rosenblum, Mendel},
  title        = {Optimizing the migration of virtual computers},
  journal      = {ACM SIGOPS Operating Systems Review},
  volume       = {36},
  number       = {SI},
  pages        = {377--390},
  year         = {2002},
  publisher    = {ACM},
}
% LessFS project web page.
@misc{LessFS,
  author       = {Ruijter, Mark},
  title        = {LessFS},
  howpublished = {\url{http://www.lessfs.com/wordpress/}},
}
% Opendedup project web page. There is no author, so the key field gives
% standard styles something to sort and label on instead of a warning.
@misc{Opendedup,
  key          = {Opendedup},
  title        = {Opendedup},
  howpublished = {\url{http://opendedup.org}},
}
% Forman, Eshghi and Chiocchetti (2005): similar files in large document repositories.
@inproceedings{forman2005finding,
  author       = {Forman, George and Eshghi, Kave and Chiocchetti, Stephane},
  title        = {Finding similar files in large document repositories},
  booktitle    = {Proceedings of the eleventh ACM SIGKDD international conference on Knowledge discovery in data mining},
  pages        = {394--400},
  year         = {2005},
  organization = {ACM},
}
% Manber (1994): finding similar files in a large file system.
% The title carries no final period; the bibliography style supplies punctuation.
@inproceedings{manber1994finding,
  author       = {Manber, Udi and others},
  title        = {Finding Similar Files in a Large File System},
  booktitle    = {USENIX Winter},
  pages        = {1--10},
  year         = {1994},
}
% Ouyang et al. (2002): cluster-based delta compression of file collections.
@inproceedings{ouyang2002cluster,
  author       = {Ouyang, Zan and Memon, Nasir and Suel, Torsten and Trendafilov, Dimitre},
  title        = {Cluster-based delta compression of a collection of files},
  booktitle    = {Web Information Systems Engineering, 2002. WISE 2002. Proceedings of the Third International Conference on},
  pages        = {257--266},
  year         = {2002},
  organization = {IEEE},
}
% Douglis and Iyengar (2003): application-specific delta encoding.
% The title carries no final period; the bibliography style supplies punctuation.
@inproceedings{douglis2003application,
  author       = {Douglis, Fred and Iyengar, Arun},
  title        = {Application-specific Delta-encoding via Resemblance Detection},
  booktitle    = {USENIX Annual Technical Conference, General Track},
  pages        = {113--126},
  year         = {2003},
}
% Baker (1995): duplication and near-duplication in large software systems.
@inproceedings{baker1995finding,
  author       = {Baker, Brenda S},
  title        = {On finding duplication and near-duplication in large software systems},
  booktitle    = {Reverse Engineering, 1995., Proceedings of 2nd Working Conference on},
  pages        = {86--95},
  year         = {1995},
  organization = {IEEE},
}
% Shivakumar and Garcia-Molina (1996): scalable and accurate copy detection.
@inproceedings{shivakumar1996building,
  author       = {Shivakumar, Narayanan and Garcia-Molina, Hector},
  title        = {Building a scalable and accurate copy detection mechanism},
  booktitle    = {Proceedings of the first ACM international conference on Digital libraries},
  pages        = {160--168},
  year         = {1996},
  organization = {ACM},
}
% Brin, Davis and Garcia-Molina (1995): copy detection for digital documents.
% Typed as an article because the venue is a periodical with volume and number,
% which standard styles do not accept together on conference-paper entries.
@article{brin1995copy,
  author       = {Brin, Sergey and Davis, James and Garcia-Molina, Hector},
  title        = {Copy detection mechanisms for digital documents},
  journal      = {ACM SIGMOD Record},
  volume       = {24},
  number       = {2},
  pages        = {398--409},
  year         = {1995},
  publisher    = {ACM},
}
% Law-To et al. (2007): comparative study of video copy detection.
@inproceedings{law2007video,
  author       = {Law-To, Julien and Chen, Li and Joly, Alexis and Laptev, Ivan and Buisson, Olivier and Gouet-Brunet, Valerie and Boujemaa, Nozha and Stentiford, Fred},
  title        = {Video copy detection: a comparative study},
  booktitle    = {Proceedings of the 6th ACM international conference on Image and video retrieval},
  pages        = {371--378},
  year         = {2007},
  organization = {ACM},
}
% Teodosiu et al. (2006): remote differential compression.
% Recorded as a technical report, with issuer and report number in their own fields.
@techreport{teodosiu2006optimizing,
  author       = {Teodosiu, Dan and Bjorner, Nikolaj and Gurevich, Yuri and Manasse, Mark and Porkka, Joe},
  title        = {Optimizing file replication over limited bandwidth networks using remote differential compression},
  institution  = {Microsoft Research},
  number       = {TR-2006-157},
  year         = {2006},
}
% Muthitacharoen, Chen and Mazieres (2001): a low-bandwidth network file system.
% Typed as an article because the venue is a periodical with volume and number,
% which standard styles do not accept together on conference-paper entries.
% The accented surname uses the braced escape form so it sorts as one letter.
@article{muthitacharoen2001low,
  author       = {Muthitacharoen, Athicha and Chen, Benjie and Mazi{\`e}res, David},
  title        = {A low-bandwidth network file system},
  journal      = {ACM SIGOPS Operating Systems Review},
  volume       = {35},
  number       = {5},
  pages        = {174--187},
  year         = {2001},
  publisher    = {ACM},
}
% Cox, Murray and Noble (2002): Pastiche backup system.
@article{cox2002pastiche,
  author       = {Cox, Landon P and Murray, Christopher D and Noble, Brian D},
  title        = {Pastiche: Making backup cheap and easy},
  journal      = {ACM SIGOPS Operating Systems Review},
  volume       = {36},
  number       = {SI},
  pages        = {285--298},
  year         = {2002},
  publisher    = {ACM},
}
% Suel, Noel and Trendafilov (2004): file synchronization over slow networks.
@inproceedings{suel2004improved,
  author       = {Suel, Torsten and Noel, Patrick and Trendafilov, Dimitre},
  title        = {Improved file synchronization techniques for maintaining large replicated collections over slow networks},
  booktitle    = {Data Engineering, 2004. Proceedings. 20th International Conference on},
  pages        = {153--164},
  year         = {2004},
  organization = {IEEE},
}
% Hua, Liu and Feng (2013): data similarity-aware computation infrastructure.
@article{hua2013data,
  author       = {Hua, Yu and Liu, Xue and Feng, Dan},
  title        = {Data Similarity-aware Computation Infrastructure for the Cloud},
  journal      = {IEEE Transactions on Computers},
  pages        = {1},
  year         = {2013},
  publisher    = {IEEE},
}
% Biswas et al. (2009): multicore caching for data-similar executions.
% Typed as an article because the venue is a periodical with volume and number,
% which standard styles do not accept together on conference-paper entries.
@article{biswas2009multi,
  author       = {Biswas, Susmit and Franklin, Diana and Savage, Alan and Dixon, Ryan and Sherwood, Timothy and Chong, Frederic T},
  title        = {Multi-execution: multicore caching for data-similar executions},
  journal      = {ACM SIGARCH Computer Architecture News},
  volume       = {37},
  number       = {3},
  pages        = {164--173},
  year         = {2009},
  publisher    = {ACM},
}
% Wikipedia article on cluster analysis. There is no author, so the key field
% gives standard styles something to sort and label on instead of a warning.
@misc{ClusterAnalysis,
  key          = {Cluster analysis},
  title        = {Cluster analysis},
  howpublished = {\url{http://en.wikipedia.org/wiki/Cluster_analysis}},
}
% Zhu, Li and Patterson (2008): Data Domain deduplication file system.
% Venue acronym spelled FAST as in the other entries of this file; the title
% carries no final period because the bibliography style supplies punctuation.
@inproceedings{zhu2008avoiding,
  author       = {Zhu, Benjamin and Li, Kai and Patterson, R Hugo},
  title        = {Avoiding the Disk Bottleneck in the {Data Domain} Deduplication File System},
  booktitle    = {FAST},
  pages        = {1--14},
  year         = {2008},
}
% Xia et al. (2011): SiLo near-exact deduplication scheme.
% System name and the RAM acronym are braced to survive sentence casing.
@inproceedings{xia2011silo,
  author       = {Xia, Wen and Jiang, Hong and Feng, Dan and Hua, Yu},
  title        = {{SiLo}: a similarity-locality based near-exact deduplication scheme with low {RAM} overhead and high throughput},
  booktitle    = {Proceedings of the 2011 USENIX conference on USENIX annual technical conference},
  pages        = {26--28},
  year         = {2011},
  organization = {USENIX Association},
}
% Second definition of key hua2013data dropped here: the identical record is
% already defined earlier in this file, and BibTeX rejects a repeated key.
% Second definition of key biswas2009multi dropped here: the same record is
% already defined earlier in this file, and BibTeX rejects a repeated key.
% Tokyo Cabinet project web page. The organisation name is double-braced so
% BibTeX treats it as one indivisible name instead of given name plus surname.
@misc{TokyoCabinet,
  author       = {{FAL Labs}},
  title        = {Tokyo Cabinet},
  howpublished = {\url{http://fallabs.com/tokyocabinet/}},
}
% Gantz and Reinsel (2010): IDC iView report on the digital universe decade.
% The title is a question; the dash is typeset with the LaTeX en-dash ligature.
@article{gantz2010digital,
  author       = {Gantz, John and Reinsel, David},
  title        = {The digital universe decade -- are you ready?},
  journal      = {IDC iView},
  year         = {2010},
}
% Biggar (2007): Enterprise Strategy Group paper on data de-duplication.
% The issuer is an organisation rather than a journal, so this is a report entry.
% NOTE(review): the White Paper label is an assumption - confirm against the document.
@techreport{biggar2007experiencing,
  author       = {Biggar, Heidi},
  title        = {Experiencing data de-duplication: Improving efficiency and reducing capacity requirements},
  institution  = {The Enterprise Strategy Group},
  type         = {White Paper},
  year         = {2007},
}
% Guo and Efstathopoulos (2011): building a high-performance deduplication system.
@inproceedings{guo2011building,
  author       = {Guo, Fanglu and Efstathopoulos, Petros},
  title        = {Building a high-performance deduplication system},
  booktitle    = {Proceedings of the 2011 USENIX conference on USENIX annual technical conference},
  pages        = {25},
  year         = {2011},
  organization = {USENIX Association},
}
% Jain, Dahlin and Tewari (2005): TAPER replica synchronization.
% The acronym is braced so sentence-casing styles do not turn it into Taper.
@inproceedings{jain2005taper,
  author       = {Jain, Navendu and Dahlin, Michael and Tewari, Renu},
  title        = {{TAPER}: Tiered Approach for Eliminating Redundancy in Replica Synchronization},
  booktitle    = {FAST},
  pages        = {21},
  year         = {2005},
}
% Adya et al. (2002): FARSITE distributed storage.
% The acronym is braced so sentence-casing styles do not turn it into Farsite.
@article{adya2002farsite,
  author       = {Adya, Atul and Bolosky, William J and Castro, Miguel and Cermak, Gerald and Chaiken, Ronnie and Douceur, John R and Howell, Jon and Lorch, Jacob R and Theimer, Marvin and Wattenhofer, Roger P},
  title        = {{FARSITE}: Federated, available, and reliable storage for an incompletely trusted environment},
  journal      = {ACM SIGOPS Operating Systems Review},
  volume       = {36},
  number       = {SI},
  pages        = {1--14},
  year         = {2002},
  publisher    = {ACM},
}
% Bolosky et al. (2000): single instance storage in Windows 2000.
% The place goes in address, not organization; the product name is braced to
% survive sentence casing.
@inproceedings{bolosky2000single,
  author       = {Bolosky, William J and Corbin, Scott and Goebel, David and Douceur, John R},
  title        = {Single instance storage in {Windows} 2000},
  booktitle    = {Proceedings of the 4th USENIX Windows Systems Symposium},
  pages        = {13--24},
  year         = {2000},
  address      = {Seattle, WA},
}
% Eshghi and Tang (2005): framework for content-based chunking algorithms.
% Recorded as a technical report, with issuer and report number in their own fields.
% NOTE(review): the number is rebuilt from the former volume (30) and pages (2005)
% values - confirm the exact report identifier.
@techreport{eshghi2005framework,
  author       = {Eshghi, Kave and Tang, Hsiu Khuern},
  title        = {A framework for analyzing and improving content-based chunking algorithms},
  institution  = {Hewlett-Packard Labs},
  number       = {HPL-2005-30},
  year         = {2005},
}
% Policroniades and Pratt (2004): detecting redundancy in storage systems data.
% The title carries no final period; the bibliography style supplies punctuation.
@inproceedings{policroniades2004alternatives,
  author       = {Policroniades, Calicrates and Pratt, Ian},
  title        = {Alternatives for Detecting Redundancy in Storage Systems Data},
  booktitle    = {USENIX Annual Technical Conference, General Track},
  pages        = {73--86},
  year         = {2004},
}
% Dutch (2008): understanding data deduplication ratios.
@inproceedings{dutch2008understanding,
  author       = {Dutch, Mike},
  title        = {Understanding data deduplication ratios},
  booktitle    = {SNIA Data Management Forum},
  year         = {2008},
}
% Kulkarni et al. (2004): redundancy elimination within large file collections.
% The title carries no final period; the bibliography style supplies punctuation.
@inproceedings{kulkarni2004redundancy,
  author       = {Kulkarni, Purushottam and Douglis, Fred and LaVoie, Jason D and Tracey, John M},
  title        = {Redundancy Elimination Within Large Collections of Files},
  booktitle    = {USENIX Annual Technical Conference, General Track},
  pages        = {59--72},
  year         = {2004},
}
% Tan et al. (2012): reducing de-linearization of data placement.
% Proceedings title written in natural order, without the dangling final colon.
@inproceedings{tan2012reducing,
  author       = {Tan, Yujuan and Yan, Zhichao and Feng, Dan and Sha, EH-M and Ge, Xiongzi},
  title        = {Reducing the De-linearization of Data Placement to Improve Deduplication Performance},
  booktitle    = {2012 SC Companion: High Performance Computing, Networking, Storage and Analysis (SCC)},
  pages        = {796--800},
  year         = {2012},
  organization = {IEEE},
}
% Zhou et al. (2014): file similarity by modulo file length.
% NOTE(review): publisher set to Springer on the understanding that this
% conference series appears in Springer proceedings - confirm.
@inproceedings{YongtaoZhouPAS,
  author       = {Zhou, Yongtao and Deng, Yuhui and Chen, Xiaoguang and Xie, Junjie},
  title        = {Identifying file similarity in large data sets by modulo file length},
  booktitle    = {Proceedings of the 14th International Conference on Algorithms and Architectures for Parallel Processing},
  year         = {2014},
  publisher    = {Springer},
}
% Lillibridge et al. (2009): sparse indexing for inline deduplication.
% Venue acronym spelled FAST as in the other entries of this file; the title
% carries no final period because the bibliography style supplies punctuation.
@inproceedings{lillibridge2009sparse,
  author       = {Lillibridge, Mark and Eshghi, Kave and Bhagwat, Deepavali and Deolalikar, Vinay and Trezise, Greg and Camble, Peter},
  title        = {Sparse Indexing: Large Scale, Inline Deduplication Using Sampling and Locality},
  booktitle    = {FAST},
  pages        = {111--123},
  year         = {2009},
}
% Bhagwat et al. (2009): Extreme binning for chunk-based file backup.
@inproceedings{bhagwat2009extreme,
  author       = {Bhagwat, Deepavali and Eshghi, Kave and Long, Darrell DE and Lillibridge, Mark},
  title        = {Extreme binning: Scalable, parallel deduplication for chunk-based file backup},
  booktitle    = {Modeling, Analysis \& Simulation of Computer and Telecommunication Systems, 2009. MASCOTS'09. IEEE International Symposium on},
  pages        = {1--9},
  year         = {2009},
  organization = {IEEE},
}
% Broder et al. (2000): min-wise independent permutations.
@article{broder2000min,
  author       = {Broder, Andrei Z and Charikar, Moses and Frieze, Alan M and Mitzenmacher, Michael},
  title        = {Min-wise independent permutations},
  journal      = {Journal of Computer and System Sciences},
  volume       = {60},
  number       = {3},
  pages        = {630--659},
  year         = {2000},
  publisher    = {Elsevier},
}
% Fu, Jiang and Xiao (2012): inline cluster deduplication framework.
@incollection{fu2012scalable,
  author       = {Fu, Yinjin and Jiang, Hong and Xiao, Nong},
  title        = {A scalable inline cluster deduplication framework for big data protection},
  booktitle    = {Middleware 2012},
  pages        = {354--373},
  year         = {2012},
  publisher    = {Springer},
}
% Debnath, Sengupta and Li (2010): ChunkStash deduplication index on flash.
% The system name is braced so its internal capital survives sentence casing.
@inproceedings{debnath2010chunkstash,
  author       = {Debnath, Biplob and Sengupta, Sudipta and Li, Jin},
  title        = {{ChunkStash}: speeding up inline storage deduplication using flash memory},
  booktitle    = {Proceedings of the 2010 USENIX conference on USENIX annual technical conference},
  pages        = {16},
  year         = {2010},
  organization = {USENIX Association},
}
% Meister and Brinkmann (2010): dedupv1 deduplication on solid state drives.
% The system name and the SSD acronym are braced to keep their exact case.
@inproceedings{meister2010dedupv1,
  author       = {Meister, Dirk and Brinkmann, Andre},
  title        = {{dedupv1}: Improving deduplication throughput using solid state drives ({SSD})},
  booktitle    = {Mass Storage Systems and Technologies (MSST), 2010 IEEE 26th Symposium on},
  pages        = {1--6},
  year         = {2010},
  organization = {IEEE},
}
% Denehy and Hsu (2003): duplicate management for reference data.
% Report label, report number and issuer each sit in their own field.
@techreport{denehy2003duplicate,
  author       = {Denehy, Timothy E and Hsu, Windsor W},
  title        = {Duplicate management for reference data},
  institution  = {IBM},
  type         = {Research Report},
  number       = {RJ10305},
  year         = {2003},
}
% Second definition of key quinlan2002venti dropped here: the same record is
% already defined earlier in this file, and BibTeX rejects a repeated key.
% Deng (2011): survey on the future of disk drives.
@article{deng2011future,
  author       = {Deng, Yuhui},
  title        = {What is the future of disk drives, death or rebirth?},
  journal      = {ACM Computing Surveys (CSUR)},
  volume       = {43},
  number       = {3},
  pages        = {23},
  year         = {2011},
  publisher    = {ACM},
}
% Deng and Wang (2008): stripe size on network attached storage systems.
@article{deng2008exploring,
  author       = {Deng, Yuhui and Wang, Frank},
  title        = {Exploring the performance impact of stripe size on network attached storage systems},
  journal      = {Journal of Systems Architecture},
  volume       = {54},
  number       = {8},
  pages        = {787--796},
  year         = {2008},
  publisher    = {Elsevier},
}
% Lu, Shu, Zheng et al. (2013): reducing file-system write amplification on flash.
% The title carries no final period; the bibliography style supplies punctuation.
@inproceedings{lu2013extending,
  author       = {Lu, Youyou and Shu, Jiwu and Zheng, Weimin and others},
  title        = {Extending the lifetime of flash-based storage through reducing write amplification from file systems},
  booktitle    = {FAST},
  pages        = {257--270},
  year         = {2013},
}
% Hsu and Smith (2004): performance impact of I/O optimizations and disk improvements.
% The I/O abbreviation is braced so sentence-casing styles do not lowercase it.
@article{hsu2004performance,
  author       = {Hsu, Windsor W and Smith, Alan Jay},
  title        = {The performance impact of {I/O} optimizations and disk improvements},
  journal      = {IBM Journal of Research and Development},
  volume       = {48},
  number       = {2},
  pages        = {255--289},
  year         = {2004},
  publisher    = {IBM},
}
% Zhou, Deng and Xie (2014): similarity and locality for fingerprint prefetching.
% NOTE(review): publisher set to IEEE to agree with the IEEE conference named in
% booktitle - confirm against the proceedings.
@inproceedings{zhou2014Leverage,
  author       = {Zhou, Yongtao and Deng, Yuhui and Xie, Junjie},
  title        = {Leverage Similarity and Locality to Enhance Fingerprint Prefetching of Data Deduplication},
  booktitle    = {Proceedings of the 20th IEEE International Conference on Parallel and Distributed Systems},
  year         = {2014},
  publisher    = {IEEE},
}
% Akyurek and Salem (1995): adaptive block rearrangement.
@article{akyurek1995adaptive,
  author       = {Aky{\"u}rek, Sedat and Salem, Kenneth},
  title        = {Adaptive block rearrangement},
  journal      = {ACM Transactions on Computer Systems (TOCS)},
  volume       = {13},
  number       = {2},
  pages        = {89--121},
  year         = {1995},
  publisher    = {ACM},
}
% Bobbarjung, Jagannathan and Dubnicki (2006): duplicate elimination in storage systems.
@article{bobbarjung2006improving,
  author       = {Bobbarjung, Deepak R and Jagannathan, Suresh and Dubnicki, Cezary},
  title        = {Improving duplicate elimination in storage systems},
  journal      = {ACM Transactions on Storage (TOS)},
  volume       = {2},
  number       = {4},
  pages        = {424--448},
  year         = {2006},
  publisher    = {ACM},
}
% Moreton, Pratt and Harris (2002): storage, mutability and naming in Pasta.
% NOTE(review): Pasta is capitalised and braced on the assumption that it is
% the name of the system described - confirm.
@incollection{moreton2002storage,
  author       = {Moreton, Tim D and Pratt, Ian A and Harris, Timothy L},
  title        = {Storage, mutability and naming in {Pasta}},
  booktitle    = {Web Engineering and Peer-to-Peer Computing},
  pages        = {215--219},
  year         = {2002},
  publisher    = {Springer},
}
% Tan et al. (2011): CABdedupe booster for cloud backup services.
% The system name is braced so its capitalisation survives sentence casing.
@inproceedings{tan2011cabdedupe,
  author       = {Tan, Yujuan and Jiang, Hong and Feng, Dan and Tian, Lei and Yan, Zhichao},
  title        = {{CABdedupe}: A causality-based deduplication performance booster for cloud backup services},
  booktitle    = {Parallel \& Distributed Processing Symposium (IPDPS), 2011 IEEE International},
  pages        = {1266--1277},
  year         = {2011},
  organization = {IEEE},
}
% Kruus, Ungureanu and Dubnicki (2010): bimodal content defined chunking.
% The title carries no final period; the bibliography style supplies punctuation.
@inproceedings{kruus2010bimodal,
  author       = {Kruus, Erik and Ungureanu, Cristian and Dubnicki, Cezary},
  title        = {Bimodal Content Defined Chunking for Backup Streams},
  booktitle    = {FAST},
  pages        = {239--252},
  year         = {2010},
}
% Kirsch, Mitzenmacher and Wieder (2009): cuckoo hashing with a stash.
@article{kirsch2009more,
  author       = {Kirsch, Adam and Mitzenmacher, Michael and Wieder, Udi},
  title        = {More robust hashing: Cuckoo hashing with a stash},
  journal      = {SIAM Journal on Computing},
  volume       = {39},
  number       = {4},
  pages        = {1543--1561},
  year         = {2009},
  publisher    = {SIAM},
}
% Pagh and Rodler (2004): cuckoo hashing.
@article{pagh2004cuckoo,
  author       = {Pagh, Rasmus and Rodler, Flemming Friche},
  title        = {Cuckoo hashing},
  journal      = {Journal of Algorithms},
  volume       = {51},
  number       = {2},
  pages        = {122--144},
  year         = {2004},
  publisher    = {Elsevier},
}
% Anand et al. (2010): cheap and large CAMs for data-intensive networked systems.
% The CAMs acronym is braced so sentence-casing styles keep its capitals.
@inproceedings{anand2010cheap,
  author       = {Anand, Ashok and Muthukrishnan, Chitra and Kappes, Steven and Akella, Aditya and Nath, Suman},
  title        = {Cheap and Large {CAMs} for High Performance Data-Intensive Networked Systems},
  booktitle    = {NSDI},
  pages        = {29},
  year         = {2010},
}
% Debnath, Sengupta and Li (2010): FlashStore key-value store.
% The system name is braced to survive sentence casing; the issue range uses
% the double hyphen that LaTeX typesets as an en dash.
@article{debnath2010flashstore,
  author       = {Debnath, Biplob and Sengupta, Sudipta and Li, Jin},
  title        = {{FlashStore}: high throughput persistent key-value store},
  journal      = {Proceedings of the VLDB Endowment},
  volume       = {3},
  number       = {1--2},
  pages        = {1414--1425},
  year         = {2010},
  publisher    = {VLDB Endowment},
}
% Debnath, Sengupta and Li (2011): SkimpyStash key-value store on flash.
% System name and the RAM acronym are braced to survive sentence casing.
@inproceedings{debnath2011skimpystash,
  author       = {Debnath, Biplob and Sengupta, Sudipta and Li, Jin},
  title        = {{SkimpyStash}: {RAM} space skimpy key-value store on flash-based storage},
  booktitle    = {Proceedings of the 2011 ACM SIGMOD International Conference on Management of data},
  pages        = {25--36},
  year         = {2011},
  organization = {ACM},
}
% Lim et al. (2011): SILT key-value store.
% The acronym is braced so sentence-casing styles do not turn it into Silt.
@inproceedings{lim2011silt,
  author       = {Lim, Hyeontaek and Fan, Bin and Andersen, David G and Kaminsky, Michael},
  title        = {{SILT}: A memory-efficient, high-performance key-value store},
  booktitle    = {Proceedings of the Twenty-Third ACM Symposium on Operating Systems Principles},
  pages        = {1--13},
  year         = {2011},
  organization = {ACM},
}
% Lu, Nam and Du (2012): BloomStore key-value store for deduplication indexing.
% System name and the eponym Bloom are braced to survive sentence casing.
@inproceedings{lu2012bloomstore,
  author       = {Lu, Guanlin and Nam, Young Jin and Du, David HC},
  title        = {{BloomStore}: {Bloom-filter} based memory-efficient key-value store for indexing of data deduplication on flash},
  booktitle    = {Mass Storage Systems and Technologies (MSST), 2012 IEEE 28th Symposium on},
  pages        = {1--11},
  year         = {2012},
  organization = {IEEE},
}
% Henson (2003): analysis of compare-by-hash.
% The title carries no final period; the bibliography style supplies punctuation.
@inproceedings{henson2003analysis,
  author       = {Henson, Val},
  title        = {An Analysis of Compare-by-hash},
  booktitle    = {HotOS},
  pages        = {13--18},
  year         = {2003},
}
% Aronovich et al. (2009): design of a similarity based deduplication system.
@inproceedings{aronovich2009design,
  author       = {Aronovich, Lior and Asher, Ron and Bachmat, Eitan and Bitner, Haim and Hirsch, Michael and Klein, Shmuel T},
  title        = {The design of a similarity based deduplication system},
  booktitle    = {Proceedings of SYSTOR 2009: The Israeli Experimental Systems Conference},
  pages        = {6},
  year         = {2009},
  organization = {ACM},
}
% Bloom (1970): space/time trade-offs in hash coding with allowable errors.
@article{bloom1970space,
  author       = {Bloom, Burton H},
  title        = {Space/time trade-offs in hash coding with allowable errors},
  journal      = {Communications of the ACM},
  volume       = {13},
  number       = {7},
  pages        = {422--426},
  year         = {1970},
  publisher    = {ACM},
}
% Fan et al. (2000): Summary cache web cache sharing protocol.
@article{fan2000summary,
  author       = {Fan, Li and Cao, Pei and Almeida, Jussara and Broder, Andrei Z},
  title        = {Summary cache: a scalable wide-area web cache sharing protocol},
  journal      = {IEEE/ACM Transactions on Networking (TON)},
  volume       = {8},
  number       = {3},
  pages        = {281--293},
  year         = {2000},
  publisher    = {IEEE Press},
}
% Dubnicki et al. (2009): HYDRAstor scalable secondary storage.
% The system name is braced so sentence-casing styles keep its capitalisation.
@inproceedings{dubnicki2009hydrastor,
  author       = {Dubnicki, Cezary and Gryz, Leszek and Heldt, Lukasz and Kaczmarczyk, Michal and Kilian, Wojciech and Strzelczak, Przemyslaw and Szczepkowski, Jerzy and Ungureanu, Cristian and Welnicki, Michal},
  title        = {{HYDRAstor}: A Scalable Secondary Storage},
  booktitle    = {FAST},
  pages        = {197--210},
  year         = {2009},
}
% Second definition of key lu2012bloomstore dropped here: the same record is
% already defined earlier in this file, and BibTeX rejects a repeated key.
% Min, Yoon and Won (2011): deduplication techniques for modern backup operation.
% Journal name written in natural order, as for the other entry from this journal.
@article{min2011efficient,
  author       = {Min, Jaehong and Yoon, Daeyoung and Won, Youjip},
  title        = {Efficient deduplication techniques for modern backup operation},
  journal      = {IEEE Transactions on Computers},
  volume       = {60},
  number       = {6},
  pages        = {824--840},
  year         = {2011},
  publisher    = {IEEE},
}
% Wei et al. (2010): MAD2 exact deduplication for network backup services.
% The acronym is braced so sentence-casing styles do not turn it into Mad2.
@inproceedings{wei2010mad2,
  author       = {Wei, Jiansheng and Jiang, Hong and Zhou, Ke and Feng, Dan},
  title        = {{MAD2}: A scalable high-throughput exact deduplication approach for network backup services},
  booktitle    = {Mass Storage Systems and Technologies (MSST), 2010 IEEE 26th Symposium on},
  pages        = {1--14},
  year         = {2010},
  organization = {IEEE},
}
% Zhou, Deng and Xie: same paper as the zhou2014Leverage entry in this file.
% The key is kept for existing citations; venue and year are filled in from
% that entry so the required fields are present.
@inproceedings{zhouleverage,
  author       = {Zhou, Yongtao and Deng, Yuhui and Xie, Junjie},
  title        = {Leverage Similarity and Locality to Enhance Fingerprint Prefetching of Data Deduplication},
  booktitle    = {Proceedings of The 20th IEEE International Conference on Parallel and Distributed Systems},
  year         = {2014},
}
% Fu et al. (2014): accelerating restore and garbage collection in backup systems.
@inproceedings{fu2014accelerating,
  author       = {Fu, Min and Feng, Dan and Hua, Yu and He, Xubin and Chen, Zuoning and Xia, Wen and Huang, Fangting and Liu, Qing},
  title        = {Accelerating restore and garbage collection in deduplication-based backup systems via exploiting historical information},
  booktitle    = {2014 USENIX Annual Technical Conference (USENIX ATC 14)},
  pages        = {181--192},
  year         = {2014},
  organization = {USENIX Association},
}
% Lillibridge, Eshghi and Bhagwat (2013): improving restore speed for backup systems.
% The title carries no final period; the bibliography style supplies punctuation.
@inproceedings{lillibridge2013improving,
  author       = {Lillibridge, Mark and Eshghi, Kave and Bhagwat, Deepavali},
  title        = {Improving restore speed for backup systems that use inline chunk-based deduplication},
  booktitle    = {FAST},
  pages        = {183--198},
  year         = {2013},
}
% Meister, Kaiser and Brinkmann (2013): block locality caching for deduplication.
@inproceedings{meister2013block,
  author       = {Meister, Dirk and Kaiser, J{\"u}rgen and Brinkmann, Andr{\'e}},
  title        = {Block locality caching for data deduplication},
  booktitle    = {Proceedings of the 6th International Systems and Storage Conference},
  pages        = {15},
  year         = {2013},
  organization = {ACM},
}
% Xu et al. (2013): Yurubackup incremental backup system in the cloud.
@article{xu2013yurubackup,
  author       = {Xu, Quanqing and Zhao, Liang and Xiao, Mingzhong and Liu, Anna and Dai, Yafei},
  title        = {Yurubackup: A space-efficient and highly scalable incremental backup system in the cloud},
  journal      = {International Journal of Parallel Programming},
  pages        = {1--23},
  year         = {2013},
  publisher    = {Springer},
}
% Second definition of key teodosiu2006optimizing dropped here: the same record
% is already defined earlier in this file, and BibTeX rejects a repeated key.
% Nam, Park and Du (2012): read performance of deduplication storage.
@inproceedings{nam2012assuring,
  author       = {Nam, Young Jin and Park, Dongchul and Du, David HC},
  title        = {Assuring demanded read performance of data deduplication storage with backup datasets},
  booktitle    = {Modeling, Analysis \& Simulation of Computer and Telecommunication Systems (MASCOTS), 2012 IEEE 20th International Symposium on},
  pages        = {201--208},
  year         = {2012},
  organization = {IEEE},
}
% Kaczmarczyk et al. (2012): data fragmentation caused by in-line deduplication.
@inproceedings{kaczmarczyk2012reducing,
  author       = {Kaczmarczyk, Michal and Barczynski, Marcin and Kilian, Wojciech and Dubnicki, Cezary},
  title        = {Reducing impact of data fragmentation caused by in-line deduplication},
  booktitle    = {Proceedings of the 5th Annual International Systems and Storage Conference},
  pages        = {15},
  year         = {2012},
  organization = {ACM},
}
% Srinivasan et al. (2012): iDedup inline deduplication for primary storage.
% The system name is braced so its lowercase-then-capital form is kept.
@inproceedings{srinivasan2012idedup,
  author       = {Srinivasan, Kiran and Bisson, Timothy and Goodson, Garth R and Voruganti, Kaladhar},
  title        = {{iDedup}: latency-aware, inline data deduplication for primary storage},
  booktitle    = {FAST},
  pages        = {1--14},
  year         = {2012},
}
% Alvarez (2011): NetApp deduplication deployment and implementation guide.
% Recorded as a technical report with the report number in its own field.
% NOTE(review): the issuing institution is taken from the title - confirm.
% Product names are braced to survive sentence casing.
@techreport{alvarez2011netapp,
  author       = {Alvarez, Carlos},
  title        = {{NetApp} deduplication for {FAS} and {V-Series} deployment and implementation guide},
  institution  = {NetApp},
  number       = {TR-3505},
  year         = {2011},
}
% IBM white paper on Storage Tank. There is no journal, so this is a misc entry.
% The corporate author is double-braced so it is not split into given name and
% surname; the month sits in its own field so year is a plain four-digit number.
@misc{StorageTank2002IBM,
  author       = {{IBM Corporation}},
  title        = {{IBM Storage Tank} -- A distributed storage system},
  howpublished = {IBM white paper},
  month        = jan,
  year         = {2002},
}
% Fu et al. (2015): design tradeoffs for deduplication in backup workloads.
@inproceedings{Min2015Design,
  author       = {Fu, Min and Feng, Dan and Hua, Yu and He, Xubin and Chen, Zuoning and Xia, Wen and Zhang, Yucheng and Tan, Yujuan},
  title        = {Design Tradeoffs for Data Deduplication Performance in Backup Workloads},
  booktitle    = {13th USENIX Conference on File and Storage Technologies (FAST 15)},
  pages        = {331--344},
  address      = {Santa Clara, CA},
  month        = feb,
  year         = {2015},
  publisher    = {USENIX Association},
  isbn         = {978-1-931971-201},
  url          = {https://www.usenix.org/conference/fast15/technical-sessions/presentation/fu},
}
% Eshghi et al. (2007): Jumbo Store incremental upload and versioning.
% The system name is braced so sentence-casing styles keep both capitals.
@inproceedings{eshghi2007jumbo,
  author       = {Eshghi, Kave and Lillibridge, Mark and Wilcock, Lawrence and Belrose, Guillaume and Hawkes, Rycharde},
  title        = {{Jumbo Store}: Providing Efficient Incremental Upload and Versioning for a Utility Rendering Service},
  booktitle    = {FAST},
  pages        = {123--138},
  year         = {2007},
}
% Lokeshwari, Prabavathy and Babu (2011): cloud storage with high throughput deduplication.
@inproceedings{lokeshwari2011optimized,
  author       = {Lokeshwari, YV and Prabavathy, B and Babu, Chitra},
  title        = {Optimized cloud storage with high throughput deduplication approach},
  booktitle    = {Proceedings of the International Conference on Emerging Technology Trends (ICETT)},
  year         = {2011},
}
% Shilane et al. (2012): delta compressed and deduplicated storage.
% A single page number replaces the degenerate range, matching the other
% single-number pages fields in this file.
@inproceedings{shilane2012delta,
  author       = {Shilane, Philip and Wallace, Grant and Huang, Mark and Hsu, Windsor},
  title        = {Delta compressed and deduplicated storage using stream-informed locality},
  booktitle    = {Proceedings of the 4th USENIX conference on Hot Topics in Storage and File Systems},
  pages        = {10},
  year         = {2012},
  organization = {USENIX Association},
}
% Ungureanu et al. (2010): HydraFS file system for HYDRAstor.
% Both system names are braced so sentence-casing styles keep their capitals.
@inproceedings{ungureanu2010hydrafs,
  author       = {Ungureanu, Cristian and Atkin, Benjamin and Aranya, Akshat and Gokhale, Salil and Rago, Stephen and Calkowski, Grzegorz and Dubnicki, Cezary and Bohra, Aniruddha},
  title        = {{HydraFS}: A High-Throughput File System for the {HYDRAstor} Content-Addressable Storage System},
  booktitle    = {FAST},
  pages        = {225--238},
  year         = {2010},
}
% Dong et al. (2011): scalable data routing for deduplication clusters.
% The title carries no final period; the bibliography style supplies punctuation.
@inproceedings{dong2011tradeoffs,
  author       = {Dong, Wei and Douglis, Fred and Li, Kai and Patterson, R Hugo and Reddy, Sazzala and Shilane, Philip},
  title        = {Tradeoffs in Scalable Data Routing for Deduplication Clusters},
  booktitle    = {FAST},
  pages        = {15--29},
  year         = {2011},
}
% Paulo et al. (2012): DEDISbench benchmark for deduplicated storage systems.
% The benchmark name is braced so sentence-casing styles keep its capitals.
@incollection{paulo2012dedisbench,
  author       = {Paulo, Jo{\~a}o and Reis, Pedro and Pereira, Jose and Sousa, Antonio},
  title        = {{DEDISbench}: A benchmark for deduplicated storage systems},
  booktitle    = {On the Move to Meaningful Internet Systems: OTM 2012},
  pages        = {584--601},
  year         = {2012},
  publisher    = {Springer},
}
% Xie, Condict and Shete (2013): estimating duplication by content-based sampling.
@inproceedings{xie2013estimating,
  author       = {Xie, Fei and Condict, Michael and Shete, Sandip},
  title        = {Estimating duplication by content-based sampling},
  booktitle    = {Proceedings of the 2013 USENIX conference on Annual Technical Conference},
  pages        = {181--186},
  year         = {2013},
  organization = {USENIX Association},
}
% IOzone benchmark web page. The author is written surname first with a space
% after the middle initial, so the initial is not absorbed into the surname.
@misc{IOzone,
  author       = {Norcott, William D.},
  title        = {IOzone},
  howpublished = {\url{http://www.iozone.org/}},
}
